library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
The SNAP enrollment data was acquired from the American Community Survey 5-year estimates of 2022 from the Census Bureau. The original dataset contained estimates for SNAP/Food stamp enrollment at the census tract level as well as other households attributes at the census tract level: estimate and percent for distribution of race and ethnicity, poverty status, disability status, presentation of children under age of 18, etc.
We selected the variables that were relevant to our analysis to build the final dataset, the final variables included were the following:
r geoid: the geographic id for identification of census
tracttotal_ct_households: estimate total count for number of
householdsph_with_children: estimate percentage of households
with children under 18 yearsph_no_children: estimate percentage of households
without children under 18 yearsph_below_poverty:estimate percentage of households
below poverty level in the past 12 monthsph_above_poverty: estimate percentage of households at
or above poverty level in the past 12 monthsph_disability: estimate percentage of households with
one or more people with disabilityph_no_disability: estimate percentage of households
with no person with disabilityph_white: estimate percent households race and Hispanic
or Latino origin of householder, White aloneph_black: estimate percent households race and Hispanic
or Latino origin of householder, Black or African American aloneph_asian: estimate percent households race and Hispanic
or Latino origin of householder, Asian aloneph_other race: estimate percent households race and
Hispanic or Latino origin of householder, other races (derived from the
estimate value for White, Black and Asian race groups)ph_hispanic: estimate percent households race and
Hispanic or Latino origin of householder, Hispanic or Latino origin of
any raceph_non_hispanic: estimate percent households race and
Hispanic or Latino origin of householder, not of Hispanic or Latino
origin of any race (derived from the estimate value for Hispanic or
Latino origin of any race)ph_no_work:estimate percent of families with no workers
in past 12 monthsph_1_work: estimate percent of families with 1 worker
in past 12 monthsph_2_work: estimate percent of families with 2 or more
workers in past 12 monthsph_snap: estimate percent of households receiving
SNAP/Food StampSNAP =
read_csv(file = "./Datasets/ACS_SNAP_ENROLLMENT.csv")|>
janitor::clean_names()|>
select(
geo_id:s2201_c01_001e,
s2201_c02_009e,
s2201_c02_015e,
s2201_c02_021e:s2201_c02_028e,
s2201_c02_032e,
s2201_c02_036e:s2201_c02_038e,
s2201_c04_001e)|>
select(-ends_with("m"))|>
slice(-1)|>
mutate(geoid = str_remove(geo_id, ".*US"))|>
rename(
total_ct_households = s2201_c01_001e,
ph_with_children = s2201_c02_009e,
ph_no_children = s2201_c02_015e,
ph_below_poverty = s2201_c02_021e,
ph_above_poverty = s2201_c02_022e,
ph_disability = s2201_c02_023e,
ph_no_disability = s2201_c02_024e,
ph_white = s2201_c02_025e,
ph_black = s2201_c02_026e,
ph_aian = s2201_c02_027e,
ph_asian = s2201_c02_028e,
ph_hispanic = s2201_c02_032e,
ph_no_work = s2201_c02_036e,
ph_1_work = s2201_c02_037e,
ph_2_work = s2201_c02_038e,
ph_snap = s2201_c04_001e
)|>
mutate(across(total_ct_households:ph_snap, as.numeric),
ph_other_race = 100-ph_white-ph_black-ph_asian,
ph_non_hispanic = 100-ph_hispanic)|>
select(geoid,name,ph_snap,everything())|>
select(-geo_id)
## Rows: 2330 Columns: 458
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (458): "GEO_ID", NAME, S2201_C01_001E, S2201_C01_001M, S2201_C01_002E, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: There were 16 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(total_ct_households:ph_snap, as.numeric)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 15 remaining warnings.
head(SNAP)|>
knitr::kable()|>
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), font_size = 12) %>%
kableExtra::scroll_box(width = "100%", height = "300px")
| geoid | name | ph_snap | total_ct_households | ph_with_children | ph_no_children | ph_below_poverty | ph_above_poverty | ph_disability | ph_no_disability | ph_white | ph_black | ph_aian | ph_asian | ph_hispanic | ph_no_work | ph_1_work | ph_2_work | ph_other_race | ph_non_hispanic |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36061000100 | Census Tract 1; New York County; New York | NA | 0 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 36061000201 | Census Tract 2.01; New York County; New York | 47.6 | 878 | 37.8 | 62.2 | 35.0 | 65.0 | 22.2 | 77.8 | 28.4 | 6.3 | 0.0 | 19.8 | 57.5 | 43.2 | 18.8 | 38.0 | 45.5 | 42.5 |
| 36061000202 | Census Tract 2.02; New York County; New York | 42.5 | 3293 | 17.6 | 82.4 | 34.5 | 65.5 | 25.1 | 74.9 | 39.9 | 14.2 | 0.5 | 15.5 | 40.4 | 23.3 | 35.0 | 41.7 | 30.4 | 59.6 |
| 36061000500 | Census Tract 5; New York County; New York | NA | 0 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 36061000600 | Census Tract 6; New York County; New York | 54.8 | 5191 | 13.1 | 86.9 | 42.3 | 57.7 | 24.1 | 75.9 | 15.3 | 7.1 | 0.3 | 53.1 | 25.3 | 37.1 | 26.9 | 36.0 | 24.5 | 74.7 |
| 36061000700 | Census Tract 7; New York County; New York | 1.0 | 4873 | 11.8 | 88.2 | 4.1 | 95.9 | 2.2 | 97.8 | 75.5 | 4.5 | 0.0 | 15.8 | 4.0 | 0.0 | 35.1 | 64.9 | 4.2 | 96.0 |
The historic redlining score was obtained from the calculation by the National Community Reinvestment Coalition(NCRC) based on the
redlining =
read_excel("./Datasets/Historic Redlining Score 2020B.xlsx")|>
janitor::clean_names()|>
mutate(fip = substr(geoid20, 1, 5))|>
filter(fip %in% c("36005", "36047", "36061", "36081", "36085"))|>
rename(geoid = geoid20)
matching the historic redlining score with snap enrollment data
redlining_snap =
redlining|>
left_join(SNAP, by = "geoid")|>
filter(is.na(ph_snap)==FALSE)
We obtained the geoid for the census tracts based on the fip code of NYC, boroughs and specific census tract codes.
nyc_healthy_store =
read_csv("./Datasets/Recognized_Shop_Healthy_Stores.csv")|>
janitor::clean_names()|>
distinct(bin, .keep_all = TRUE)|>
mutate(fipcode = case_when(
borough == "Bronx" ~ "36005",
borough == "Brooklyn" ~ "36047",
borough == "New York" ~ "36061"
),
ct_label = case_when(
census_tract_2020 < 100 ~ paste0("00", census_tract_2020, "00"),
census_tract_2020 < 1000 ~ paste0("0", census_tract_2020, "00"),
census_tract_2020 < 1200 ~ paste0(census_tract_2020, "00"),
census_tract_2020 < 10000 ~ paste0("00", census_tract_2020),
census_tract_2020 < 100000 ~ paste0("0", census_tract_2020)
))|>
select(store_name,borough,zip_code,latitude,longitude,fipcode, ct_label)|>
filter(!is.na(ct_label))|>
mutate(geoid = paste(fipcode, ct_label, sep = ""))|>
select(-fipcode, -ct_label)
## Rows: 675 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Store Name, Street Address, Borough, Neighborhood Tabulation Area ...
## dbl (10): Zip
## Code, Year Awarded, Program
## Wave, Latitude, Longitude, Com...
## lgl (1): Address
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
We obtain the count for the number of healthy food stores in the census tracts and matched that with the snap enrollment data for New York City, Bronx and Brooklyn Borough.
please note that healthy store count is only available in three boroughs so make sure you filter before the match
health_store_count =
nyc_healthy_store|>
group_by(geoid)|>
summarise(count_healthy_stores=n())
redlining_snap|>
filter(str_starts(geoid, "36005") | str_starts(geoid, "36047")| str_starts(geoid, "36061"))|>
left_join(health_store_count,by="geoid")
## # A tibble: 1,321 × 26
## geoid cbsa metro_name hrs2020 interval2020 fip name ph_snap
## <chr> <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 36061008603 35620 New York-Newark-J… 1 1 36061 Cens… 0
## 2 36061010200 35620 New York-Newark-J… 1 1 36061 Cens… 3.1
## 3 36061010601 35620 New York-Newark-J… 1 1 36061 Cens… 0
## 4 36061011401 35620 New York-Newark-J… 1 1 36061 Cens… 1.2
## 5 36061011402 35620 New York-Newark-J… 1 1 36061 Cens… 0.8
## 6 36061012000 35620 New York-Newark-J… 1 1 36061 Cens… 0.8
## 7 36061012200 35620 New York-Newark-J… 1 1 36061 Cens… 0
## 8 36061012800 35620 New York-Newark-J… 1 1 36061 Cens… 0
## 9 36061013000 35620 New York-Newark-J… 1 1 36061 Cens… 1.1
## 10 36061014200 35620 New York-Newark-J… 1 1 36061 Cens… 0
## # ℹ 1,311 more rows
## # ℹ 18 more variables: total_ct_households <dbl>, ph_with_children <dbl>,
## # ph_no_children <dbl>, ph_below_poverty <dbl>, ph_above_poverty <dbl>,
## # ph_disability <dbl>, ph_no_disability <dbl>, ph_white <dbl>,
## # ph_black <dbl>, ph_aian <dbl>, ph_asian <dbl>, ph_hispanic <dbl>,
## # ph_no_work <dbl>, ph_1_work <dbl>, ph_2_work <dbl>, ph_other_race <dbl>,
## # ph_non_hispanic <dbl>, count_healthy_stores <int>
place_crude =
read_csv("./Datasets/PLACES.csv")|>
janitor::clean_names()|>
rename(geoid = location_id)|>
select(geoid, measure:data_value, measure_id)
## Rows: 73621 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): StateAbbr, StateDesc, CountyName, DataSource, Category, Measure, D...
## dbl (10): Year, CountyFIPS, LocationName, Data_Value, Low_Confidence_Limit, ...
## lgl (2): Data_Value_Footnote_Symbol, Data_Value_Footnote
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
place_cleaned=
place_crude|>
filter(measure_id %in% c("CANCER","DIABETES", "HIGHCHOL", "OBESITY"))|>
pivot_wider(
id_cols = geoid,
names_from = measure_id,
values_from = data_value
)|>
mutate(geoid = as.character(geoid))|>
janitor::clean_names()
code book for place measures
place_crude|>
distinct(measure, data_value_unit, measure_id)|>
filter(measure_id %in% c("CANCER","DIABETES", "HIGHCHOL", "OBESITY"))|>
knitr::kable()
| measure | data_value_unit | measure_id |
|---|---|---|
| Obesity among adults | % | OBESITY |
| Diagnosed diabetes among adults | % | DIABETES |
| High cholesterol among adults who have ever been screened | % | HIGHCHOL |
| Cancer (non-skin) or melanoma among adults | % | CANCER |
Comments: I think we may or may not need the following data in analysis, but if someone want to use that, we will include them in the data cleaning, or I will just delete them
poverty =
read_csv(file = "./Datasets/ACS-poverty-level.csv", skip=1)|>
janitor::clean_names()|>
mutate(geoid = str_remove(geography, ".*US"))
## New names:
## Rows: 2327 Columns: 375
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (146): Geography, Geographic Area Name, Estimate!!Total!!UNRELATED INDIV... dbl
## (228): Estimate!!Total!!Population for whom poverty status is determined... lgl
## (1): ...375
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...375`
For this data we have more detailed demographic by age division, and slightly different way of measuring race and ethnicty (Hispanic White non-Hispanic,Black non-Hispanic, Asian non-Hispanic, Some other race non-Hispanic, Non-Hispanic of two or more races). In case this will be interesting for EDA or analysis
demo =
read_excel("./Datasets/NYC_census_core_data.xlsx")|>
janitor::clean_names()|>
filter(geo_type == "CT2020")|>
rename(geoid = geo_id)